# Package setup ----
# NOTE(review): fpp and fpp3 are both attached; they mask each other's data
# objects and pull in overlapping dependencies — confirm both are needed.
library(tidyverse) # for general analysis
library(fpp)
library(fpp3) # for predictions 
library(ggthemes) # for beautiful themes
library(TSA) # for TS analysis
library(kableExtra) # for beautiful tables
library(lubridate) # for time data
library(tsibble) # for tsibble data format
library(caret) # for modeling
library(stats)
library(ggplot2)
library(MuMIn)
library(Metrics)
library(tstools)

# Set a global ggplot2 theme for all subsequent plots
theme_set(theme_minimal())

Exploratory Data Analysis

Data is imported and transformed into a time series object.

# Import Zillow monthly median-sale-price data and build a monthly ts object.
sales_month <- read_csv("Zillow monthly raw data (full).csv") #import raw data file

chicago_sales_m <- sales_month %>% filter(RegionName == "Chicago, IL") #keep Chicago rows only 

# Reshape the wide month columns into (Date, Price) rows, then keep only those
# two columns. NOTE(review): gather() is superseded — pivot_longer() is the
# modern tidyr equivalent.
chicago_sales_m <- chicago_sales_m %>% #select median price and date only 
                 gather(key = "Date", value = "Price", -RegionID:-StateName) %>%
                 select(Date, Price) %>%
                 rename(Median_price = Price)

# Parse month/day/year strings into Date objects
chicago_sales_m$Date <- mdy(chicago_sales_m$Date)

dim(chicago_sales_m) # 151 months x 2 columns
## [1] 151   2
# Monthly time series starting Feb 2008 (frequency = 12)
chi_med <- ts(chicago_sales_m$Median_price, start=c(2008,2), frequency=12)

There are two distinctive trends - a negative trend during and in the wake of the 2008 financial crisis (2008-2013), followed by a positive trend until the present. There is also clear additive seasonality. However, this seasonal trend has been impacted by the current COVID-19 pandemic.

# Full series: 2008-2013 decline, post-2013 recovery, additive seasonality
autoplot(chi_med, main="Chicago: Median Home Sale Price")

Omitted COVID-19 from time series due to difficulty predicting values in earlier analysis. It also allows us to work with 12 complete years of data.

# Drop the COVID-affected observations (Feb 2020 onward) so we work with
# 12 complete years: Feb 2008 through Jan 2020. window() truncates while
# keeping the original ts attributes intact.
chi_median <- window(chi_med, end = c(2020, 1))
autoplot(chi_median, main = "Chicago: Median Home Sale Price")

Data is complete, and in appropriate form.

# Sanity checks on the trimmed series: completeness, frequency, structure
sum(is.na(chi_median)) #check for null values
## [1] 0
frequency(chi_median) #check correct frequency of time series
## [1] 12
cycle(chi_median) #check appropriate structure of time series 
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2008       2   3   4   5   6   7   8   9  10  11  12
## 2009   1   2   3   4   5   6   7   8   9  10  11  12
## 2010   1   2   3   4   5   6   7   8   9  10  11  12
## 2011   1   2   3   4   5   6   7   8   9  10  11  12
## 2012   1   2   3   4   5   6   7   8   9  10  11  12
## 2013   1   2   3   4   5   6   7   8   9  10  11  12
## 2014   1   2   3   4   5   6   7   8   9  10  11  12
## 2015   1   2   3   4   5   6   7   8   9  10  11  12
## 2016   1   2   3   4   5   6   7   8   9  10  11  12
## 2017   1   2   3   4   5   6   7   8   9  10  11  12
## 2018   1   2   3   4   5   6   7   8   9  10  11  12
## 2019   1   2   3   4   5   6   7   8   9  10  11  12
## 2020   1
min(chi_median) #$149,000, which corresponds to February 2013
## [1] 149000
max(chi_median) #$258,500 which corresponds to July 2008
## [1] 258500

Decomposition of the time series validates the initial impression of the data. However, there appears to be more than one seasonal pattern, evidenced by the "seasonal" and "random" components of the decomposition.

# Classical (additive) decomposition: trend, seasonal component, remainder
plot(decompose(chi_median))

Median price peaks during the summer months. The range between median prices is also narrower in the spring/summer months than in the rest of the year.

# Seasonal boxplot: medians peak (and spread narrows) in spring/summer months
boxplot(chi_median~cycle(chi_median), xlab="Cycle (Month)", ylab="Dollars", main="Chicago: Median Home Sales Price")

Both the ACF and PACF decay slowly and sinusoidally over time, indicating seasonality and a significant linear relationship between the series and its lags.

# Slow, sinusoidal decay in the ACF/PACF -> strong trend plus seasonality
acf(chi_median, main="ACF Chicago Median Home Sales", lag.max=50)

pacf(chi_median, main="PACF Chicago Median Home Sales", lag.max=50)

Looking at the raw periodogram, we can see that there are many spikes and therefore many contributing frequencies. The periodogram has two dominant spikes, which are most important to the overall signal.

# Frequency-domain view: stats::spectrum (smoothed) and TSA::periodogram (raw)
spectrum(chi_median)

periodogram(chi_median)

Create train/test split for modelling.

# Train/test split: train through Jan 2019, hold out the last 12 months
# (Feb 2019 - Jan 2020) as the test set.
# Use exact c(year, month) endpoints rather than fractional years such as
# 2019.05 — monthly sample times fall on multiples of 1/12, so a fractional
# year only works by accident of rounding. The resulting split is identical.
chi_train <- window(chi_median, end = c(2019, 1))   #end train in Jan 2019
chi_test <- window(chi_median, start = c(2019, 2))  #use last year of dataset as test

#double-check time series structure
frequency(chi_train) 
## [1] 12
cycle(chi_train) 
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2008       2   3   4   5   6   7   8   9  10  11  12
## 2009   1   2   3   4   5   6   7   8   9  10  11  12
## 2010   1   2   3   4   5   6   7   8   9  10  11  12
## 2011   1   2   3   4   5   6   7   8   9  10  11  12
## 2012   1   2   3   4   5   6   7   8   9  10  11  12
## 2013   1   2   3   4   5   6   7   8   9  10  11  12
## 2014   1   2   3   4   5   6   7   8   9  10  11  12
## 2015   1   2   3   4   5   6   7   8   9  10  11  12
## 2016   1   2   3   4   5   6   7   8   9  10  11  12
## 2017   1   2   3   4   5   6   7   8   9  10  11  12
## 2018   1   2   3   4   5   6   7   8   9  10  11  12
## 2019   1
frequency(chi_test)
## [1] 12
cycle(chi_test) 
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2019       2   3   4   5   6   7   8   9  10  11  12
## 2020   1

Holt-Winters Seasonal Method

The Holt-Winters Seasonal Method was selected based on its appropriateness for time series with both a trend and seasonality. Two additive HW models were created - one with and one without damping.

The first model, which does not include damping, does not completely capture the autocorrelation in the time series. There remains significant autocorrelation in the residuals, and the residuals are not normally distributed.

# Holt-Winters additive model, no damping; 12-month forecast horizon
HW1 <- hw(chi_train, seasonal = "additive",h=12, damped = FALSE) 
plot(HW1)

# Ljung-Box p ~ 2e-10: residuals remain strongly autocorrelated (not white noise)
checkresiduals(HW1)

## 
##  Ljung-Box test
## 
## data:  Residuals from Holt-Winters' additive method
## Q* = 61.568, df = 8, p-value = 2.293e-10
## 
## Model df: 16.   Total lags used: 24

Adding damping does not improve the model. The residuals do not resemble white noise.

# Holt-Winters additive model with damped trend, for comparison with HW1
HW2 <- hw(chi_train, seasonal = "additive",h=12, damped = TRUE) 
plot(HW2)

# Damping barely changes the Ljung-Box statistic; residuals still not white noise
checkresiduals(HW2)

## 
##  Ljung-Box test
## 
## data:  Residuals from Damped Holt-Winters' additive method
## Q* = 60.547, df = 7, p-value = 1.174e-10
## 
## Model df: 17.   Total lags used: 24

sArima

From the EDA, we know that there is significant seasonality in the data, suggesting that a sArima model might be appropriate. We begin exploring ways to make the data stationary.

We first check if a Box-Cox Transformation is necessary, and find that lambda should be set to 1.999 (rounded to 2 for simplicity). Once the Box-Cox Transformation is completed, a KPSS Test for stationarity is completed. The series is not stationary.

# Estimate the Box-Cox lambda on the training set
BoxCox.lambda(chi_train) # lambda = 1.999924, significantly different from 1
## [1] 1.999924
lambda <- 2 # round lambda value to 2 for simplicity 
transformed <- BoxCox(chi_train, lambda=lambda) # BoxCox Transformation
# KPSS null hypothesis is stationarity: p = 0.03 < 0.05 rejects it, so the
# transformed series is still non-stationary.
kpss.test(transformed) # p=0.03, data post-Box-Cox transformation is not stationary 
## 
##  KPSS Test for Level Stationarity
## 
## data:  transformed
## KPSS Level = 0.55011, Truncation lag parameter = 4, p-value = 0.03038

The time series becomes stationary after 1st order differencing.

differenced <- diff(transformed) #first-order difference of transformed data
# KPSS p = 0.1 (>= 0.05): fail to reject stationarity after one difference
kpss.test(differenced) # KPSS test - p=0.1 > 0.05, data is stationary after Box-Cox transformation and 1 round of differencing 
## Warning in kpss.test(differenced): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  differenced
## KPSS Level = 0.096927, Truncation lag parameter = 4, p-value = 0.1
tsdisplay(differenced, lag=50) #visualize series + ACF/PACF of differenced data 

periodogram(differenced) # NOTE(review): unclear whether the periodogram should be read on the raw or differenced series — confirm

A sArima model is built using auto.arima, with differencing=1 and lambda=2. This results in an ARIMA(1,1,0)(0,1,2)[12] model. The p-value for the Ljung-Box test is 0.05097. This is on the border of being statistically significant, assuming we use a 0.05 benchmark. However, 0.05 is arbitrary and for the purpose of this project, we believe that p=0.051 is an acceptable indicator that the residuals resemble white noise.

# Automatic sARIMA search with differencing fixed at d = 1 and Box-Cox
# lambda fixed at 2; selects ARIMA(1,1,0)(0,1,2)[12].
AR1 <-  auto.arima(chi_train, d=1, lambda=lambda)
summary(AR1)
## Series: chi_train 
## ARIMA(1,1,0)(0,1,2)[12] 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           ar1     sma1     sma2
##       -0.3102  -0.2970  -0.2017
## s.e.   0.0971   0.1126   0.1121
## 
## sigma^2 estimated as 1.049e+18:  log likelihood=-2637.67
## AIC=5283.34   AICc=5283.69   BIC=5294.46
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 1069.989 4941.647 3764.568 0.5665218 1.917839 0.2884911
##                     ACF1
## Training set 0.008751159
# Ljung-Box p = 0.051 — borderline; treated here as acceptable white noise
checkresiduals(AR1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)(0,1,2)[12]
## Q* = 32.59, df = 21, p-value = 0.05097
## 
## Model df: 3.   Total lags used: 24
plot(forecast(AR1))

Further exploration is necessary to see if a superior sArima model can be built.

# Extended ACF to suggest candidate (p, q) orders.
# NOTE(review): eacf is normally read on a stationary series; this is the
# untransformed, undifferenced training set — confirm that is intended.
eacf(chi_train)
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x x x x x x x  x  x  x 
## 1 x x o o x x x o o x x  x  x  x 
## 2 x o x o o o x o o o o  x  o  o 
## 3 x o o o o o o o o o o  x  x  o 
## 4 o x x o o o o o o o o  x  o  o 
## 5 x o o x o o o o o o o  x  x  x 
## 6 x x o x x o o o o o o  x  x  x 
## 7 x x o x o o o o o o o  x  x  x

We experiment with different combinations of p=1/2, q=1/2, and P=0/1. However, none of the models pass the Ljung-Box test — their residuals still show significant autocorrelation and therefore do not resemble white noise.

# Manual candidate 1: add an MA(1) term to the auto.arima spec
AR2 <-  Arima(chi_train, order=c(1,1,1), seasonal=c(0,1,2),lambda=lambda) #p=0.03
summary(AR2)
## Series: chi_train 
## ARIMA(1,1,1)(0,1,2)[12] 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           ar1     ma1     sma1     sma2
##       -0.4387  0.1431  -0.3083  -0.2036
## s.e.   0.1876  0.1990   0.1128   0.1115
## 
## sigma^2 estimated as 1.052e+18:  log likelihood=-2637.42
## AIC=5284.84   AICc=5285.37   BIC=5298.73
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 1051.221 4921.178 3745.168 0.5573495 1.907905 0.2870044
##                      ACF1
## Training set -0.005981719
checkresiduals(AR2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(0,1,2)[12]
## Q* = 33.497, df = 20, p-value = 0.02973
## 
## Model df: 4.   Total lags used: 24
# Manual candidate 2: add a seasonal AR(1) term
AR3 <-  Arima(chi_train, order=c(1,1,0), seasonal=c(1,1,2),lambda=lambda) #p=0.03
summary(AR3)
## Series: chi_train 
## ARIMA(1,1,0)(1,1,2)[12] 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           ar1     sar1    sma1     sma2
##       -0.3211  -0.7318  0.5002  -0.4931
## s.e.   0.0978   0.1078  0.7083   0.3563
## 
## sigma^2 estimated as 9.508e+17:  log likelihood=-2636.06
## AIC=5282.12   AICc=5282.65   BIC=5296.01
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 962.8262 4629.704 3584.713 0.5117101 1.824064 0.2747082 0.01163099
checkresiduals(AR3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)(1,1,2)[12]
## Q* = 32.984, df = 20, p-value = 0.03388
## 
## Model df: 4.   Total lags used: 24
# Manual candidate 3: balanced (1,1,1)(1,1,1) specification
AR4 <-  Arima(chi_train, order=c(1,1,1), seasonal=c(1,1,1),lambda=lambda) #p=0.02
summary(AR4)
## Series: chi_train 
## ARIMA(1,1,1)(1,1,1)[12] 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           ar1     ma1    sar1     sma1
##       -0.4390  0.1464  0.2773  -0.6399
## s.e.   0.1912  0.2030  0.2045   0.1746
## 
## sigma^2 estimated as 1.068e+18:  log likelihood=-2638.14
## AIC=5286.28   AICc=5286.81   BIC=5300.18
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 1032.062 4937.399 3741.426 0.5457746 1.902978 0.2867176
##                     ACF1
## Training set -0.00711367
checkresiduals(AR4)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(1,1,1)[12]
## Q* = 34.998, df = 20, p-value = 0.02011
## 
## Model df: 4.   Total lags used: 24
# Manual candidate 4: AR(2) non-seasonal with a single seasonal MA term
AR5 <-  Arima(chi_train, order=c(2,1,0), seasonal=c(0,1,1),lambda=lambda) #p=0.01
summary(AR5)
## Series: chi_train 
## ARIMA(2,1,0)(0,1,1)[12] 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           ar1     ar2     sma1
##       -0.2720  0.0935  -0.4252
## s.e.   0.1059  0.0985   0.1200
## 
## sigma^2 estimated as 1.073e+18:  log likelihood=-2638.82
## AIC=5285.64   AICc=5285.99   BIC=5296.75
## 
## Training set error measures:
##                    ME     RMSE     MAE       MPE     MAPE     MASE        ACF1
## Training set 901.3052 4925.984 3739.03 0.4798191 1.896996 0.286534 -0.02840107
checkresiduals(AR5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,0)(0,1,1)[12]
## Q* = 37.589, df = 21, p-value = 0.01439
## 
## Model df: 3.   Total lags used: 24

Next, we use the ARIMA(1,1,0)(0,1,2)[12] model to forecast results for the next year. The "Actual vs. Predicted" chart shows that the model does a decent job of forecasting values pre-COVID, but is not capable of accurately predicting the trend for the past few months.

#model 1: forecast the 12-month holdout with the auto.arima model
AR1_fcast <- forecast(AR1, h=12)
plot(AR1_fcast, chi_test)

# Overlay actuals vs point forecasts for the holdout year
tsplot(cbind(chi_test, AR1_fcast$mean), plot_title="Actual vs. Predicted")

# Holdout accuracy (Metrics::mae / Metrics::rmse)
mae(AR1_fcast$mean, chi_test)
## [1] 4181.585
rmse(AR1_fcast$mean, chi_test)
## [1] 4559.497

Next we do cross-validation to understand how the model’s performance evolves over time.

# Rolling-window cross-validation of the ARIMA(1,1,0)(0,1,2)[12] spec:
# refit on each k-month window, forecast H months ahead, record errors/AICc.
k <- 72 # minimum observations (6 years)
n <- length(chi_median) # Number of data points
p <- 12 # Period
H <- 12 # Forecast horizon

# Temporarily silence warnings during the loop; restored after the loop
# (the original saved defaultW but never restored it — a side-effect leak).
defaultW <- getOption("warn") 
options(warn = -1)

st <- tsp(chi_median)[1]+(k-2)/p #  gives the start time in time units

error_AR1 <- matrix(NA,n-k,H) # One-year forecast horizon error for each window

AICc_AR1 <- matrix(NA,n-k) # Estimated model AICc value for each window

for(i in seq_len(n-k))
{
  # rolling window of length k, followed by an H-step test window
  train <- window(chi_median, start=st+(i-k+1)/p, end=st+i/p) ## Window Length: k
  test <- window(chi_median, start=st + (i+1)/p, end=st+(i+H)/p) ## Window Length: H

  # Refit the chosen sARIMA specification on this window
  AR12 <-  Arima(train, order=c(1,1,0), seasonal=list(order=c(0,1,2), period=p), lambda='auto', method='ML') # ARIMA(1,1,0)(0,1,2)[12]

  fcast_AR1 <- forecast(AR12, h=H)

  # Error & AICc (test may be shorter than H near the end of the series,
  # hence the 1:length(test) indexing; the rest stays NA)
  error_AR1[i,1:length(test)] <- fcast_AR1[['mean']]-test

  AICc_AR1[i] <- AR12$aicc
}

# Restore the user's warning setting
options(warn = defaultW)

MAE_AR1 <- colMeans(abs(error_AR1), na.rm=TRUE)
RMSE_AR1 <- sqrt(colMeans(error_AR1**2, na.rm=TRUE))

# Error by forecast horizon. Legend colors now match the plotted lines
# (MAE in col=3, RMSE in col=2); the original legend used col=1:4, which
# displayed the wrong colors, and mislabeled RMSE as "AR - RMSE".
plot(1:12, MAE_AR1, type="l",col=3,xlab="Horizon", ylab="Error", ylim=c(3600,8000))
lines(1:12, RMSE_AR1, type="l",col=2)
legend("topleft",legend=c("AR1 - MAE", "AR1 - RMSE"),col=c(3,2),lty=1)

# AICc per CV iteration (1:(n-k) replaces the hard-coded 1:72)
plot(1:(n-k), AICc_AR1, type="l",col=1,xlab="Iterations", ylab="AICc", ylim=c(750,2650))
legend("bottomright",legend="AR1 - AICc",col=1,lty=1)

Regression with Arima errors

As previously mentioned, the sArima model was not able to capture the changed environment due to COVID. Therefore, we will try Regression with Arima errors, in order to include other leading predictors.

We tried Google Trends data for two search terms: “homes for sale” and “realtor”.

First, we import the datasets and double-check dimensions.

# Google Trends predictors: monthly search interest for two terms
homes <- read_csv("home for sale.csv") #Google Trends "home for sale" Data 
realtor <- read_csv("realtor.csv") #Google Trends "realtor" Data 

# Convert Month strings to tsibble yearmonth values
homes$Month <- yearmonth(homes$Month) 
realtor$Month <- yearmonth(realtor$Month) 

# Same 151-month span as the sales data; no missing values in either series
dim(homes)
## [1] 151   2
dim(realtor)
## [1] 151   2
sum(is.na(homes))
## [1] 0
sum(is.na(realtor))
## [1] 0

Store as TS object

# Monthly ts objects aligned to the sales series span (Feb 2008 - Jan 2020)
homes_ts <- ts(homes$`Home for sale`, frequency=12, start=c(2008,2), end=c(2020,1))
realtor_ts <- ts(realtor$Realtor, frequency=12, start=c(2008,2), end=c(2020,1))

Visualize the median sale price (scaled) dataset and the two Google Trends datasets. They show similar seasonality, suggesting that they might be appropriate predictors.

# Scale price to thousands so all three series are visible on one axis
chi_median_scale <- chi_median/1000
tsplot(cbind(chi_median_scale, homes_ts, realtor_ts), plot_title="Trends")

Train/test split

# Train/test split for the Google Trends series, matching the sales split:
# train through Jan 2019, test = Feb 2019 - Jan 2020.
# (The original comments said "end train in August 2019", which was wrong —
# the split point is January 2019.) Exact c(year, month) endpoints replace
# the fragile fractional-year values (2019.05); the resulting split is the same.
home_train <- window(homes_ts, end = c(2019, 1))    #end train in Jan 2019
home_test <- window(homes_ts, start = c(2019, 2))   #use last year of dataset as test

realtor_train <- window(realtor_ts, end = c(2019, 1))   #end train in Jan 2019
realtor_test <- window(realtor_ts, start = c(2019, 2))  #use last year of dataset as test

Transform the realtor data using the Box-Cox lambda estimated from the training set prior to fitting the regression model.

# Estimate and apply a Box-Cox transformation to the realtor training series.
# NOTE(review): the original comment claimed lambda = 1.87 while the
# surrounding text says 0.3; the actual estimate is not shown in the
# transcript — confirm which value BoxCox.lambda() returns here.
lambda_realtor <- BoxCox.lambda(realtor_train)
transformed_realtor <- BoxCox(realtor_train, lambda=lambda_realtor) # BoxCox Transformation

The regression model using the transformed_realtor dataset does not pass the Ljung-Box test for residual autocorrelation (p = 0.0359), so its residuals do not resemble white noise.

# Regression with ARIMA errors: transformed realtor series as the regressor
RAR1 <- auto.arima(y=chi_train, lambda=lambda, xreg=transformed_realtor) #auto.arima model 
summary(RAR1) # b) summary 
## Series: chi_train 
## Regression with ARIMA(1,1,0)(0,1,2)[12] errors 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##           ar1     sma1     sma2      xreg
##       -0.3116  -0.3106  -0.2004  425096.3
## s.e.   0.0966   0.1136   0.1126  408400.9
## 
## sigma^2 estimated as 1.047e+18:  log likelihood=-2637.13
## AIC=5284.25   AICc=5284.78   BIC=5298.15
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 1062.713 4907.453 3735.484 0.5620696 1.901453 0.2862623
##                       ACF1
## Training set -0.0007944304
# Ljung-Box p = 0.036 < 0.05: residual autocorrelation remains
checkresiduals(RAR1) # c) check residuals 

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,1,0)(0,1,2)[12] errors
## Q* = 32.757, df = 20, p-value = 0.03587
## 
## Model df: 4.   Total lags used: 24

The Box-Cox lambda for the "home for sale" dataset is estimated below; note that the transformed series is nevertheless used in the model that follows.

# Estimate a Box-Cox lambda for the "home for sale" training series.
# NOTE(review): the original comment said lambda = 0.43 is "similar to 1 so
# won't transform", yet the transformed series IS created here and used in
# RAR2 below — reconcile the comment with the code.
lambda_home <- BoxCox.lambda(home_train)
transformed_home <- BoxCox(home_train, lambda=lambda_home) # BoxCox Transformation

Similar to the other regression model, there remains significant autocorrelation in the residuals for this model as well.

# Regression with ARIMA errors: transformed "home for sale" series as regressor
RAR2 <- auto.arima(y=chi_train, lambda=lambda, xreg=transformed_home) #auto.arima model 
summary(RAR2) # b) summary 
## Series: chi_train 
## Regression with ARIMA(0,1,0)(0,1,0)[12] errors 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##            xreg
##       160586165
## s.e.   93761414
## 
## sigma^2 estimated as 1.35e+18:  log likelihood=-2652.29
## AIC=5308.58   AICc=5308.68   BIC=5314.14
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 493.8748 5724.653 4172.448 0.2591168 2.136138 0.3197482 -0.3611121
# Ljung-Box p ~ 3e-06: residuals clearly not white noise
checkresiduals(RAR2) # c) check residuals 

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Q* = 67.876, df = 23, p-value = 2.576e-06
## 
## Model df: 1.   Total lags used: 24

Save as TS object, do train/test split

# Combine both Google Trends series into one multivariate ts for use as a
# two-column xreg matrix.
gtrends <- cbind(homes, realtor$Realtor)
gtrends_ts <- ts(gtrends, frequency=12, start=c(2008,2), end=c(2020,1))
gtrends_ts <- cbind(gtrends_ts[,'Home for sale'], gtrends_ts[,'realtor$Realtor'])

# Split at Jan 2019 (not August, as the original comments claimed), using
# exact c(year, month) endpoints instead of fractional years; the resulting
# split is identical to the original's.
gtrends_train <- window(gtrends_ts, end = c(2019, 1))   #end train in Jan 2019
gtrends_test <- window(gtrends_ts, start = c(2019, 2))  #use last year of dataset as test

As with the other regression models, there remains significant autocorrelation in the residuals. We will experiment with other predictors later, time permitting.

# Regression with ARIMA errors: both Google Trends series as regressors
RAR3 <- auto.arima(y=chi_train, lambda=lambda, xreg=gtrends_train) #auto.arima model 
summary(RAR3) # b) summary 
## Series: chi_train 
## Regression with ARIMA(0,1,0)(0,1,0)[12] errors 
## Box Cox transformation: lambda= 2 
## 
## Coefficients:
##       gtrends_ts[, "Home for sale"]  gtrends_ts[, "realtor$Realtor"]
##                            15819071                         10641279
## s.e.                       10454873                         14885032
## 
## sigma^2 estimated as 1.358e+18:  log likelihood=-2652.13
## AIC=5310.26   AICc=5310.47   BIC=5318.59
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 486.6505 5714.013 4162.284 0.2552087 2.129803 0.3189694 -0.365891
# Ljung-Box p ~ 7e-07: residuals clearly not white noise
checkresiduals(RAR3) # c) check residuals 

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(0,1,0)(0,1,0)[12] errors
## Q* = 69.758, df = 22, p-value = 7.219e-07
## 
## Model df: 2.   Total lags used: 24

Dynamic Harmonic Regression

Recall that the periodogram points to two major spikes.

# Raw periodogram of the training series, tabulated as (spectrum, frequency)
temp <- periodogram(chi_train)

# NOTE(review): spec is divided by 1e5 and rounded here, yet the printed
# values below are large — the captured output may predate this scaling;
# confirm before relying on the magnitudes.
spec_p <- matrix(round(temp$spec/1e5, 2))
freq_p <- matrix(temp$freq)
# Two dominant spikes at frequencies 0.00741 and 0.08148 (per the printed
# table; an earlier comment said 0.00625 / 0.08125, which does not match)
cbind(spec_p, freq_p)
##            [,1]        [,2]
##  [1,] 506115.69 0.007407407
##  [2,]  35676.94 0.014814815
##  [3,]   1144.62 0.022222222
##  [4,]   8029.70 0.029629630
##  [5,]  10025.13 0.037037037
##  [6,]   7849.82 0.044444444
##  [7,]   5659.41 0.051851852
##  [8,]   3956.29 0.059259259
##  [9,]  12754.36 0.066666667
## [10,]   2951.17 0.074074074
## [11,] 159940.23 0.081481481
## [12,]  49147.41 0.088888889
## [13,]    858.39 0.096296296
## [14,]    148.28 0.103703704
## [15,]    226.85 0.111111111
## [16,]    389.89 0.118518519
## [17,]    489.30 0.125925926
## [18,]    305.07 0.133333333
## [19,]   1024.03 0.140740741
## [20,]    404.58 0.148148148
## [21,]   1523.30 0.155555556
## [22,]   2566.60 0.162962963
## [23,]   8394.61 0.170370370
## [24,]    264.95 0.177777778
## [25,]    616.84 0.185185185
## [26,]    772.83 0.192592593
## [27,]    277.76 0.200000000
## [28,]    376.47 0.207407407
## [29,]    236.36 0.214814815
## [30,]    590.41 0.222222222
## [31,]    526.37 0.229629630
## [32,]    114.09 0.237037037
## [33,]   1229.78 0.244444444
## [34,]   2064.65 0.251851852
## [35,]    106.71 0.259259259
## [36,]    656.63 0.266666667
## [37,]     53.24 0.274074074
## [38,]    205.91 0.281481481
## [39,]    150.02 0.288888889
## [40,]    379.36 0.296296296
## [41,]    196.86 0.303703704
## [42,]    189.96 0.311111111
## [43,]    415.47 0.318518519
## [44,]    216.71 0.325925926
## [45,]   1990.78 0.333333333
## [46,]     72.82 0.340740741
## [47,]    353.82 0.348148148
## [48,]    357.78 0.355555556
## [49,]    798.82 0.362962963
## [50,]    242.78 0.370370370
## [51,]     41.58 0.377777778
## [52,]    450.60 0.385185185
## [53,]    369.11 0.392592593
## [54,]    313.05 0.400000000
## [55,]    140.36 0.407407407
## [56,]    638.75 0.414814815
## [57,]    339.75 0.422222222
## [58,]     64.48 0.429629630
## [59,]    543.94 0.437037037
## [60,]    109.46 0.444444444
## [61,]    198.45 0.451851852
## [62,]     23.81 0.459259259
## [63,]    389.30 0.466666667
## [64,]    827.45 0.474074074
## [65,]    173.57 0.481481481
## [66,]    189.30 0.488888889
## [67,]    674.16 0.496296296
# Period of the low-frequency spike: ~135 months ~ 11.2 years (roughly the
# full sample length, i.e. the long-run trend)
(1/0.00741)/12
## [1] 11.24606
# Period of the second spike: ~12.3 months ~ annual seasonality
(1/0.08148)/12
## [1] 1.022746

We create a regression model using a Fourier transformation (K=2) as the predictor, and Arima errors. This results in an ARIMA(0,2,3)(1,0,0)[12] model. The residuals are stationary and relatively normally distributed, signaling that they are white noise (p=0.09553).

# Dynamic harmonic regression: Fourier terms (K=2) as regressors with ARIMA
# errors. NOTE(review): textbook examples set seasonal=FALSE so the Fourier
# terms alone carry the seasonality; here auto.arima was left free and chose
# a seasonal AR term as well — confirm whether that is intended.
DHR1 <-  auto.arima(chi_train, xreg=fourier(chi_train, K=2))
summary(DHR1)
## Series: chi_train 
## Regression with ARIMA(0,2,3)(1,0,0)[12] errors 
## 
## Coefficients:
##           ma1     ma2      ma3    sar1     S1-12       C1-12      S2-12
##       -1.2942  0.5532  -0.2389  0.6197  1738.008  -15309.818  -2878.576
## s.e.   0.0843  0.1600   0.1190  0.0724  2275.337    2278.924   1003.122
##          C2-12
##       1525.533
## s.e.   997.508
## 
## sigma^2 estimated as 25399909:  log likelihood=-1292.61
## AIC=2603.21   AICc=2604.71   BIC=2629.02
## 
## Training set error measures:
##                    ME    RMSE      MAE      MPE     MAPE     MASE        ACF1
## Training set 620.9367 4845.17 3818.003 0.293618 1.914033 0.292586 -0.02108458
# Ljung-Box p = 0.096 >= 0.05: residuals are consistent with white noise
checkresiduals(DHR1)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(0,2,3)(1,0,0)[12] errors
## Q* = 23.732, df = 16, p-value = 0.09553
## 
## Model df: 8.   Total lags used: 24

This model is less accurate than the sArima model used for prediction earlier.

#model 2: the DHR forecast needs future Fourier regressors, generated with h=12

DHR1_fcast <- forecast(DHR1, h=12, xreg=fourier(chi_train, K=2, h=12))
plot(DHR1_fcast, chi_test)

# Overlay actuals vs point forecasts for the holdout year
tsplot(cbind(chi_test, DHR1_fcast$mean), plot_title="Actual vs. Predicted")

# Holdout accuracy: worse than the sARIMA model (MAE 4182 / RMSE 4559)
mae(DHR1_fcast$mean, chi_test)
## [1] 5797.028
rmse(DHR1_fcast$mean, chi_test)
## [1] 6226.299

Intervention

Will come back to this

# Intervention-analysis scratchpad (marked "will come back to this").
# Splits the series at the Feb 2013 minimum (observation 61) and inspects
# the pre- and post-trough segments separately.
# NOTE(review): chi_median[0:61] is really elements 1:61 — the zero index is
# silently dropped in R, so prefer 1:61 for clarity. Numeric subsetting also
# strips the ts attributes, so acf/pacf/eacf see a plain vector with no
# frequency information.
#min(chi_median) Recall minimum ($149,000) corresponds to February 2013 (61st observation)
acf(chi_median[0:61])

pacf(chi_median[0:61])

eacf(chi_median[0:61])
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x x x x x x x  x  x  o 
## 1 o o o o o o x o o o o  x  o  o 
## 2 x o o o o o x o o o o  o  o  o 
## 3 o x o o o o x o o o o  o  o  o 
## 4 x o o o o o o o o o o  o  o  o 
## 5 x o o o o o o o o o o  o  o  o 
## 6 x o o o o o o o o o o  x  o  o 
## 7 x o o o o o o o o o o  o  o  o
# Post-trough segment: observations 62-144 (Mar 2013 - Jan 2020)
acf(chi_median[62:144])

pacf(chi_median[62:144])

eacf(chi_median[62:144])
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x o o o x x x x  x  x  x 
## 1 x x x o x x o o o x x  x  x  o 
## 2 o o x o o o o o o o o  x  o  o 
## 3 x o x o o o o o o o o  x  x  o 
## 4 o o x o o o o o o o o  x  o  o 
## 5 x x o o o o o o o o o  x  x  o 
## 6 x x o o x o o o o o o  x  x  x 
## 7 x x o o o o o o o o o  x  o  o
# auto.arima on the pre-trough segment (as a plain vector — no seasonality
# information available, hence the non-seasonal ARIMA(0,1,0) result)
auto.arima(chi_median[0:61], lambda='auto')
## Series: chi_median[0:61] 
## ARIMA(0,1,0) 
## Box Cox transformation: lambda= 1.999924 
## 
## sigma^2 estimated as 2.685e+18:  log likelihood=-1358.16
## AIC=2718.33   AICc=2718.4   BIC=2720.42
# Sketches for a TSA::arimax intervention model — not yet implemented
#arimax(y,order=c(1,0,1), xreg=data.frame(x=c(rep(0,200),1:200)))
#xreg=data.frame(x=c(rep(0,200),1:200)))

#?arimax

#xreg = data.frame(x=c(rep(0,61),chi_median[62:144]))

#xreg

#chi_median